# loading data
kang_seed <- read.table(here("data/sev208_kratseedbank_20120213.txt"), sep=',', header = TRUE) |>
mutate(mnd = as.factor(mnd)) |>
filter(!(species %in% c("soil", "plant", "dist", "litter", "gravel")))
gg_miss_var(kang_seed)
skim(kang_seed)
| Name | kang_seed |
| Number of rows | 960 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| factor | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| dir | 0 | 1 | 1 | 1 | 0 | 4 | 0 |
| loc | 0 | 1 | 1 | 1 | 0 | 4 | 0 |
| species | 0 | 1 | 4 | 6 | 0 | 8 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| mnd | 0 | 1 | FALSE | 10 | 18: 128, 23: 128, 25: 128, 6: 96 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| seeds | 0 | 1 | 11.1 | 48.21 | 0 | 0 | 0 | 3 | 656 | ▇▁▁▁▁ |
# visualize counts
ggplot(kang_seed, aes(x = seeds)) +
geom_histogram(bins = 17)
#visualize using box plot
ggplot(data = kang_seed, aes(x = mnd, y = seeds)) +
geom_boxplot() +
labs(x = "Mound Number", y = "NUmber of Seeds", title = "Boxplot of Seeds Found at Different Mounds") +
theme_bw()
# set up data for bar graph
kang_seed_bar <- kang_seed |>
group_by(mnd) |>
summarise(across(seeds, sum)) |>
ungroup()
# bar graph
ggplot() +
geom_bar(data = kang_seed_bar, aes(x = mnd, y = seeds, fill = mnd),
stat = "identity", show.legend = F) +
theme_bw() +
labs(x = "Mound Number", y = "Number of Seeds", title = "Total Seed Count per Mound")
Question: How does total seed number differ between kangaroo rat mound locations?
We will be performing an ANOVA test to determine if there is a difference in the total seed count between kangaroo rat mound locations.
\(H_0:\) There is no significant difference in seed counts between the kangaroo rat mound locations. \(H_a:\) There is a significant difference in seed counts between at least two of the kangaroo mound locations.
# make aov object
kang_aov <- aov(seeds ~ mnd, data = kang_seed)
# get resiudals
kang_res <- kang_aov$residuals
# check normality
qqPlot(kang_res)
## [1] 899 195
The data does not appear to be normal, which we expceted because in the context reading they discussed how in their analysis the data was non-normal even after trying transformations. We decided to try a log transformation regaudless because this would be the logical next step in this analysis.
# transform data
kang_seed_log <- kang_seed |>
mutate(seeds = log(seeds + 1))
# make new aov object
kang_log_aov <- aov(seeds ~ mnd, data = kang_seed_log)
# residuals
kang_log_res <- kang_log_aov$residuals
#test normality
qqPlot(kang_log_res)
## [1] 899 195
The data is still non-normal as expected (we mentionded above that the context paper mentioned how the data is not normal)
# show aov summary
summary(kang_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## mnd 9 15897 1766 0.758 0.655
## Residuals 950 2212569 2329
The ANOVA p-value is greater than 0.05, meaning we fail to reject the null hypothesis that there is no significant difference in total seed count between different kangaroo rat mounds.
# loading data
shrub_raw <- read.csv(here("data/shrubstudy_seed.csv"))
shrub_seed <- shrub_raw |>
dplyr::select(treatment, species, total_inf = total_nr_infl,
num_seeds = nr_seeds, shrub_num, plant_num = plant_nr, tag_num) |>
mutate(treatment = as.factor(treatment))
# visualize missing data
gg_miss_var(shrub_seed)
skim(shrub_seed)
| Name | shrub_seed |
| Number of rows | 287 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| factor | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| species | 0 | 1 | 6 | 6 | 0 | 19 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| treatment | 0 | 1 | FALSE | 2 | con: 174, shr: 113 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| total_inf | 0 | 1.00 | 5.15 | 10.45 | 1 | 1.00 | 1 | 5.00 | 117 | ▇▁▁▁▁ |
| num_seeds | 105 | 0.63 | 14.55 | 28.62 | 0 | 1.25 | 5 | 13.75 | 285 | ▇▁▁▁▁ |
| shrub_num | 0 | 1.00 | 29.53 | 16.04 | 5 | 13.00 | 30 | 44.00 | 54 | ▆▁▇▂▅ |
| plant_num | 0 | 1.00 | 2.43 | 1.34 | 1 | 1.00 | 2 | 3.00 | 5 | ▇▆▅▃▂ |
| tag_num | 0 | 1.00 | 143.48 | 63.71 | 22 | 104.00 | 159 | 181.00 | 300 | ▃▂▇▂▁ |
# filter out missing data
shrub_seed_sub <- shrub_seed |>
drop_na(num_seeds)
ggplot(shrub_seed_sub, aes(x = num_seeds)) +
geom_histogram(bins = 17)
# calc Pearson's r for numerical values
shrub_seed_sub_cor <- shrub_seed_sub |>
dplyr::select(3:4) |>
cor(method = "pearson")
# plot correlation values
corrplot(shrub_seed_sub_cor,
method = "ellipse",
addCoef.col = "black")
shrub_seed_sub |>
dplyr::select(!num_seeds) |>
ggpairs()
Question: How does seed count vary with plot type (shrub or open), plant species, and total number of inflorescences? Is there a simpler model that explains seed count, and if so, what is it?
# linear model, we know this is wrong
shrub_model1 <- lm(num_seeds ~ treatment + species + total_inf, data = shrub_seed_sub)
# generalized linear model with Poisson distribution
shrub_model2 <- glm(num_seeds ~ treatment + species + total_inf, data = shrub_seed_sub, family = "poisson")
# generalized linear model with negative binomial distribution
shrub_model3 <- glmmTMB(num_seeds ~ treatment + species + total_inf, data = shrub_seed_sub, family = "nbinom2")
# generalized linear model with Poisson distribution and random effect of treatment
shrub_model4 <- glmmTMB(num_seeds ~ treatment + species + total_inf + (1|shrub_num),
data = shrub_seed_sub, family = "poisson")
# generalized linear model with negative binomial distribution and random effect of treatment
shrub_model5 <- glmmTMB(num_seeds ~ treatment + species + total_inf + (1|shrub_num),
data = shrub_seed_sub, family = "nbinom2")
Because we are looking at count data, we know that the data is discrete and only has a lower bound. Knowing this, we built a couple different models using the Poisson and Negative Binomial distribution.
# check diagnostics
simulationOutput_m1 <- simulateResiduals(shrub_model1)
plot(simulationOutput_m1)
## qu = 0.25, log(sigma) = -2.738927 : outer Newton did not converge fully.
## qu = 0.75, log(sigma) = -2.138039 : outer Newton did not converge fully.
par(mfrow = c(1,2))
testDispersion(simulationOutput_m1)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 0.95995, p-value = 0.64
## alternative hypothesis: two.sided
testZeroInflation(simulationOutput_m1)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = Inf, p-value < 2.2e-16
## alternative hypothesis: two.sided
# check diagnostics
simulationOutput_m2 <- simulateResiduals(shrub_model2)
plot(simulationOutput_m2)
## qu = 0.5, log(sigma) = -2.296452 : outer Newton did not converge fully.
par(mfrow = c(1,2))
testDispersion(simulationOutput_m2)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 20.946, p-value < 2.2e-16
## alternative hypothesis: two.sided
testZeroInflation(simulationOutput_m2)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = 3.8259, p-value < 2.2e-16
## alternative hypothesis: two.sided
simulationOutput_m3 <- simulateResiduals(shrub_model3)
plot(simulationOutput_m3)
par(mfrow = c(1,2))
testDispersion(simulationOutput_m3)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 1.6709, p-value = 0.208
## alternative hypothesis: two.sided
testZeroInflation(simulationOutput_m3)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = 1.4543, p-value = 0.024
## alternative hypothesis: two.sided
# check diagnostics
simulationOutput_m4 <- simulateResiduals(shrub_model4)
plot(simulationOutput_m4)
par(mfrow = c(1,2))
testDispersion(simulationOutput_m4)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 0.80605, p-value = 0.576
## alternative hypothesis: two.sided
testZeroInflation(simulationOutput_m4)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = 3.2154, p-value < 2.2e-16
## alternative hypothesis: two.sided
# check diagnostics
simulationOutput_m5 <- simulateResiduals(shrub_model5)
plot(simulationOutput_m5)
par(mfrow = c(1,2))
testDispersion(simulationOutput_m5)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 1.6618, p-value = 0.192
## alternative hypothesis: two.sided
testZeroInflation(simulationOutput_m5)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = 1.4583, p-value = 0.04
## alternative hypothesis: two.sided
Breakdown of models 1-5:
| Model | Formula | Distribution | QQ Plot Residuals | Residual
vs. Predicted | Over/Under Dispersion | Zeros | |—|—|—|—|—|—|—| |
shrub_model1 | num_seeds ~ treatment + species + total_inf | Normal | F
| F | Under | Too many | | shrub_model2 | num_seeds ~ treatment +
species + total_inf | Poisson | F | F | Over | Too many | | shrub_model3
| num_seeds ~ treatment + species + total_inf | Negative Binomial | P |
F | None | Too many | | shrub_model4 | num_seeds ~ treatment + species +
total_inf + (1|shrub_num) | Poisson | F | F | Over | Too many | |
shrub_model5 | num_seeds ~ treatment + species + total_inf +
(1|shrub_num) | Negative Binomial | P | F | None | Too many |
Based on the tests above, it appears that either shrub_model3 or shrub_model5 will be the best models because neither are over or under-dispered. However, they do still have some patterns in the residuals, and they appear to be zero-inflated. To further our model, we will create a new model with a zero-inflated term.
# generalized linear model with negative binomial distribution
shrub_model6 <- glmmTMB(num_seeds ~ treatment + species + total_inf, ziformula = ~1,
data = shrub_seed_sub, family = "nbinom2")
With the zero-inflated model created, we will now run the necessary diagnostics.
# check diagnostics
simulationOutput_m6 <- simulateResiduals(shrub_model6)
plot(simulationOutput_m6)
par(mfrow = c(1,2))
testDispersion(simulationOutput_m6)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 1.7383, p-value = 0.36
## alternative hypothesis: two.sided
testZeroInflation(simulationOutput_m6)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = 1.0537, p-value = 0.824
## alternative hypothesis: two.sided
Based on the above plots, shrub_model6 appears to be our best model. There is no over or underdispersion, and our distribution has about the same number of zeros as our model would predict. There is still some trend in our residuals, but it appears to have been lessened when compared to shrub_model3. We acknowledge this trend in residuals and have decided that isn’t a big cause for concern since the other diagnostics were met.
Moving forward, we will check AICC values to test whether our assumptions about the models are correct.
MuMIn::model.sel(shrub_model1, shrub_model2, shrub_model3, shrub_model4, shrub_model5, shrub_model6)
## Model selection table
## (Intr) spc ttl_inf trt cnd((Int)) dsp((Int)) cnd(spc) cnd(ttl_inf)
## shrub_model6 2.158 + + 0.06625
## shrub_model3 1.918 + + 0.07499
## shrub_model5 1.916 + + 0.07501
## shrub_model1 -2.567 + 2.15000 +
## shrub_model4 2.488 + + 0.03145
## shrub_model2 2.549 + 0.02989 +
## cnd(trt) zi((Int)) family class random df logLik AICc
## shrub_model6 + -2.281 n2(lg) glmmTMB 10 -553.169 1127.6
## shrub_model3 + n2(lg) glmmTMB 9 -556.455 1132.0
## shrub_model5 + n2(lg) glmmTMB c(s_n) 10 -556.282 1133.9
## shrub_model1 gs(id) lm 9 -692.468 1404.0
## shrub_model4 + ps(lg) glmmTMB c(s_n) 9 -1058.768 2136.6
## shrub_model2 ps(lg) glm 8 -1150.107 2317.0
## delta weight
## shrub_model6 0.00 0.863
## shrub_model3 4.33 0.099
## shrub_model5 6.23 0.038
## shrub_model1 276.36 0.000
## shrub_model4 1008.96 0.000
## shrub_model2 1189.42 0.000
## Abbreviations:
## family: gs(id) = 'gaussian(identity)', n2(lg) = 'nbinom2(log)',
## ps(lg) = 'poisson(log)'
## Models ranked by AICc(x)
## Random terms:
## c(s_n): cond(1 | shrub_num)
This chart confirms our assumptions about our models; shrub_model6 is best, followed by shrub_model3 and shrub_model5. We know this because of their respective AICC values of 1127.6, 1132.0, and 1133.9. In general, a lower AICC value equates to a better model.
Next, we will look closer at our model: shrub_model6.
# summary
summary(shrub_model6)
## Family: nbinom2 ( log )
## Formula: num_seeds ~ treatment + species + total_inf
## Zero inflation: ~1
## Data: shrub_seed_sub
##
## AIC BIC logLik deviance df.resid
## 1126.3 1158.4 -553.2 1106.3 172
##
##
## Dispersion parameter for nbinom2 family (): 2.21
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.158294 0.211515 10.204 < 2e-16 ***
## treatmentshrub -0.298882 0.131165 -2.279 0.0227 *
## speciesCARRUP -1.720761 0.280803 -6.128 8.9e-10 ***
## speciesGEUROS -0.239652 0.241272 -0.993 0.3206
## speciesKOBMYO 0.045647 0.199786 0.228 0.8193
## speciesMINOBT -0.416863 0.210918 -1.976 0.0481 *
## speciesTRIDAS 1.486501 0.724587 2.052 0.0402 *
## total_inf 0.066248 0.007463 8.877 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Zero-inflation model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.2813 0.3944 -5.784 7.3e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The summary of our model shows that treatmentshrub, speciesCARRUP, speciesMINOBT, speciesTRIDAS, and total_inf all appears to be significant indicators of total number of seeds.
# confidence intervals
confint(shrub_model6)
## 2.5 % 97.5 % Estimate
## cond.(Intercept) 1.74373255 2.572855217 2.15829389
## cond.treatmentshrub -0.55596148 -0.041802243 -0.29888186
## cond.speciesCARRUP -2.27112456 -1.170397104 -1.72076083
## cond.speciesGEUROS -0.71253699 0.233232924 -0.23965203
## cond.speciesKOBMYO -0.34592667 0.437221033 0.04564718
## cond.speciesMINOBT -0.83025460 -0.003471633 -0.41686312
## cond.speciesTRIDAS 0.06633737 2.906665530 1.48650145
## cond.total_inf 0.05162033 0.080875060 0.06624770
## zi.(Intercept) -3.05433608 -1.508209420 -2.28127275
# adjusted R2
r.squaredGLMM(shrub_model6)
## R2m R2c
## delta 0.7256489 0.7256489
## lognormal 0.7667371 0.7667371
## trigamma 0.6697461 0.6697461
# model object in table
shrub_model6 |>
as_flextable()
Estimate | Standard Error | statistic | p-value | ||||
|---|---|---|---|---|---|---|---|
Fixed effects | |||||||
(Intercept) | 2.158 | 0.212 | 10.204 | 0.0000 | *** | ||
treatmentshrub | -0.299 | 0.131 | -2.279 | 0.0227 | * | ||
speciesCARRUP | -1.721 | 0.281 | -6.128 | 0.0000 | *** | ||
speciesGEUROS | -0.240 | 0.241 | -0.993 | 0.3206 |
| ||
speciesKOBMYO | 0.046 | 0.200 | 0.228 | 0.8193 |
| ||
speciesMINOBT | -0.417 | 0.211 | -1.976 | 0.0481 | * | ||
speciesTRIDAS | 1.487 | 0.725 | 2.052 | 0.0402 | * | ||
total_inf | 0.066 | 0.007 | 8.877 | 0.0000 | *** | ||
(Intercept) | -2.281 | 0.394 | -5.784 | 0.0000 | *** | ||
Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05 | |||||||
square root of the estimated residual variance: 2.2 | |||||||
data's log-likelihood under the model: -553.2 | |||||||
Akaike Information Criterion: 1,126.3 | |||||||
Bayesian Information Criterion: 1,158.4 | |||||||
The above displays what we studied earlier in the summary, just in an easier to digest format.
plot(allEffects(shrub_model6))
To answer the question we are studying, how does seed count vary with plot type (shrub or open), plant species, and total number of inflorescences, we need to look at the effects of each of the variables. This plot shows that the number of seeds is generally higher in the open (control) plot. Additionally, the species TRIDAS tends to have the highest number of seeds, whereas CARRUP appears to have the lowest amount of seeds. Lastly, the number of seeds increases as the number of total inflorescenses increases.